import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import eli5
from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from typing import List, Union, Dict
# Warnings will be used to silence various model warnings for tidier output
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# ---- Data loading and encoding ----
# Fix the NumPy RNG so any stochastic steps below are reproducible.
np.random.seed(0)
# Load the pre-mapped German credit dataset (local path — adjust as needed).
german_xai=pd.read_csv('C:/Users/krish/Downloads/German-mapped.csv')
# One-hot encode the categorical columns.
german_xai=pd.get_dummies(german_xai,columns=['CreditHistory','Purpose','Savings','EmployDuration','Debtors','Collateral','OtherPayBackPlan','Property','Job'])
german_xai.head()
german_xai.columns
# Reorder columns so the numeric fields come first and the target
# (CreditStatus) is the last column — later code relies on this ordering.
german_xai = german_xai.reindex(columns=['NumMonths', 'CreditAmount', 'PayBackPercent', 'Gender',
'ResidenceDuration', 'Age', 'ExistingCredit', 'Dependents', 'Telephone',
'Foreignworker', 'Marital_Status',
'CreditHistory_Delay', 'CreditHistory_none/paid', 'CreditHistory_other',
'Purpose_CarNew', 'Purpose_CarUsed', 'Purpose_biz',
'Purpose_domestic app', 'Purpose_education', 'Purpose_furniture/equip',
'Purpose_others', 'Purpose_radio/tv', 'Purpose_repairs',
'Purpose_retraining', 'Savings_500+', 'Savings_<500', 'Savings_none',
'EmployDuration_1-4 yr', 'EmployDuration_4-7 yr',
'EmployDuration_<1 yr', 'EmployDuration_>=7 yr',
'EmployDuration_unemployed', 'Debtors_co-applicant',
'Debtors_guarantor', 'Debtors_none', 'Collateral_car/other',
'Collateral_real estate', 'Collateral_savings/life insurance',
'Collateral_unknown/none', 'OtherPayBackPlan_bank',
'OtherPayBackPlan_none', 'OtherPayBackPlan_stores', 'Property_free',
'Property_own', 'Property_rent',
'Job_management/self-emp/officer/highly qualif emp',
'Job_skilled employee', 'Job_unemp/unskilled-non resident',
'Job_unskilled-resident', 'CreditStatus'])
german_xai.head()
from sklearn.preprocessing import MinMaxScaler
# Scale only CreditAmount into [0, 1]; the other numeric columns are
# left on their original scales.
scaler = MinMaxScaler()
german_xai[['CreditAmount']]=scaler.fit_transform(german_xai[['CreditAmount']])
# Persist the encoded dataset.
german_xai.to_csv('C:/Users/krish/Downloads/German-encoded.csv', index=False)
# Split features/target: all columns except the last are features,
# CreditStatus (last column after the reindex above) is the target.
X = german_xai.iloc[:, :-1]
y = german_xai['CreditStatus']
X.head()
y.head()
# 80/20 split, stratified so both classes keep their proportions.
X_train,X_test,y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=40,stratify=y)
german_xai.dtypes
import klib
# Visual check for missing values in features and target.
klib.missingval_plot(X)
klib.missingval_plot(y)
from sklearn.feature_selection import mutual_info_classif
# Mutual information of each feature with the discrete target, computed on
# the training split only; fixed random_state for repeatability.
mutual_info=mutual_info_classif(X_train, y_train,random_state=40)
mutual_info
mutual_info_classif estimates the mutual information between each feature and a discrete target variable.
Mutual information (MI) between two random variables is a non-negative value which measures the dependency between the variables. It is equal to zero if and only if the two random variables are independent, and higher values mean higher dependency.
X.columns
# Attach feature names to the MI scores and rank/plot them.
mutual_info=pd.Series(mutual_info)
mutual_info.index=X_train.columns
mutual_info.sort_values(ascending=False)
mutual_info.sort_values(ascending=False).plot.bar(figsize=(15,5))
Selecting the top 22 features having the highest dependency with respect to the target variable CreditStatus.
# The 22 features with the highest mutual information scores.
mutual_info.sort_values(ascending=False)[0:22]
# Reduced frame: those 22 features plus the CreditStatus target.
german_xai_imp=german_xai[['Gender','Age','Marital_Status','NumMonths','Savings_<500','Savings_none','Dependents','Property_rent',
'Job_management/self-emp/officer/highly qualif emp','Debtors_guarantor','Purpose_CarNew',
'Purpose_furniture/equip','CreditHistory_none/paid','Purpose_CarUsed','CreditAmount',
'Collateral_real estate','Debtors_none','Job_unemp/unskilled-non resident','Purpose_others',
'CreditHistory_other','PayBackPercent','Collateral_unknown/none','Purpose_education', 'CreditStatus']]
german_xai_imp.head()
# Pairwise correlations, rounded to one decimal for readability.
corrMatrix = round(german_xai_imp.corr(),1)
corrMatrix
klib.corr_plot(german_xai_imp,annot=False)
klib.corr_plot(german_xai_imp,target='CreditStatus')
No strong correlation is observed among the input variables (except gender and marital status (0.7), and credit amount and number of months (0.6)), nor between the target variable and the input variables. But since we are trying to understand the impact of the protected variables, let us retain them without dropping any.
# Persist the reduced dataset for later use.
german_xai_imp.to_csv('C:/Users/krish/Downloads/German-reduced.csv', index=False)
# Alternative sklearn feature-selection approaches, kept for reference:
#from sklearn.feature_selection import SelectPercentile
#selected_top=SelectPercentile(score_func=mutual_info_classif, percentile=20)
#from sklearn.feature_selection import SelectKBest
#selected_top=SelectKBest(mutual_info_classif,k=10)
#selected_top.fit_transform(X_train,y_train)
#selected_top.fit_transform(X_train,y_train)
#X_sig=X_train.columns[selected_top.get_support()]
#X_sig
#X_train_sig=pd.DataFrame(X_train,columns=X_sig)
#X_test_sig=pd.DataFrame(X_test,columns=X_sig)
#X_train_sig.head()
#X_train_sig.shape
#X_test_sig.head()
#X_test_sig.shape
from IPython.display import Image
# Reference image listing the protected variables (local path).
Image(filename='C:/Users/krish/Desktop/list of protected variables.png',width=500,height=30)
From the above, we have 3 protected fields in our dataset:
1. Gender
2. Age
3. Marital Status
Now, let us identify the privileged class in each protected attribute.
# Class balance and mean CreditStatus per Gender group — a higher mean
# indicates more favorable outcomes for that group.
print(german_xai_imp['Gender'].value_counts())
german_xai_imp.groupby(['Gender'])['CreditStatus'].mean()
#https://arxiv.org/pdf/1810.01943.pdf, https://arxiv.org/pdf/2005.12379.pdf
Males (1) outnumber females, and for males (1) the target variable CreditStatus is more favorable, with a higher group average than the female group. Hence male (1) is the privileged class.
# Class balance and mean CreditStatus per Age group (Age is already
# binarized upstream: 1 for older, 0 for younger applicants).
print(german_xai_imp['Age'].value_counts())
german_xai_imp.groupby(['Age'])['CreditStatus'].mean()
Age > 26 is coded as 1, else 0. People above 26 are more numerous, and the group average for age > 26 is higher than that for age <= 26, so age (1) is the privileged group.
# Class balance and mean CreditStatus per Marital_Status group.
print(german_xai_imp['Marital_Status'].value_counts())
german_xai_imp.groupby(['Marital_Status'])['CreditStatus'].mean()
Singles (1) outnumber non-singles, and for singles (1) the target variable CreditStatus is more favorable, with a higher group average than the non-single group. Hence single (1) is the privileged group.
BinaryLabelDataset: Base class for all structured datasets with binary labels.
# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.explainers import MetricTextExplainer
from aif360.metrics import ClassificationMetric
# Wrap the reduced DataFrame in AIF360's BinaryLabelDataset:
# label 1 is favorable, and Age / Gender / Marital_Status are declared
# protected with value 1 marking the privileged class in each.
from aif360 import datasets
aif_train_dataset = datasets.BinaryLabelDataset(favorable_label = 1, unfavorable_label = 0, df=german_xai_imp,
label_names=["CreditStatus"],
protected_attribute_names=["Age","Gender","Marital_Status"],
privileged_protected_attributes = [1,1,1])
# 70/30 shuffled split inside the AIF360 representation.
dataset_orig_train, dataset_orig_test = aif_train_dataset.split([0.7], shuffle=True)
dataset_orig_train.feature_names
# Disparate impact / mean-difference measurement. NOTE: despite the variable
# names ending in "_gender", this metric uses ALL THREE protected attributes
# jointly (unprivileged = 0 on all of Age, Gender, Marital_Status;
# privileged = 1 on all three), unlike the per-attribute metrics below.
metric_aif_train_ready_gender = BinaryLabelDatasetMetric(
aif_train_dataset,
unprivileged_groups=[{"Age":0,"Gender":0,"Marital_Status":0}], privileged_groups=[{"Age":1,"Gender":1,"Marital_Status":1}])
explainer_aif_train_ready_gender = MetricTextExplainer(metric_aif_train_ready_gender)
print(explainer_aif_train_ready_gender.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of gender = %f" % metric_aif_train_ready_gender.mean_difference())
from aif360.algorithms.preprocessing import Reweighing
# Reweighing assigns instance weights so that, after transformation, the
# weighted outcome rates of the privileged and unprivileged Gender groups
# are balanced on the training split.
privileged_groups = [{'Gender': 1}]
unprivileged_groups = [{'Gender': 0}]
RW_gender = Reweighing(unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
dataset_aif_tranf_gender = RW_gender.fit_transform(dataset_orig_train)
# Re-measure the mean difference after reweighing (expected to move toward 0).
metric_transf_train_gender = BinaryLabelDatasetMetric(dataset_aif_tranf_gender,
unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
print("Difference in mean outcomes between unprivileged and privileged groups of gender = %f" % metric_transf_train_gender.mean_difference())
# Disparate impact measurement for age
metric_aif_train_ready_age = BinaryLabelDatasetMetric(
aif_train_dataset,
unprivileged_groups=[{"Age":0}],
privileged_groups=[{"Age":1}])
explainer_aif_train_ready_age = MetricTextExplainer(metric_aif_train_ready_age)
print(explainer_aif_train_ready_age.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of age = %f" % metric_aif_train_ready_age.mean_difference())
from aif360.algorithms.preprocessing import Reweighing
# Reweigh the training split to balance outcomes across Age groups.
privileged_groups = [{'Age': 1}]
unprivileged_groups = [{'Age': 0}]
RW_age = Reweighing(unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
dataset_aif_tranf_age = RW_age.fit_transform(dataset_orig_train)
# Mean difference after reweighing (expected to move toward 0).
metric_transf_train_age = BinaryLabelDatasetMetric(dataset_aif_tranf_age,
unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
print("Difference in mean outcomes between unprivileged and privileged groups of age = %f" % metric_transf_train_age.mean_difference())
metric_transf_train_age
# Disparate impact measurement for marital status
# (the original comment wrongly said "for age")
metric_aif_train_ready_marital = BinaryLabelDatasetMetric(
aif_train_dataset,
unprivileged_groups=[{"Marital_Status":0}],
privileged_groups=[{"Marital_Status":1}])
explainer_aif_train_ready_marital = MetricTextExplainer(metric_aif_train_ready_marital)
print(explainer_aif_train_ready_marital.disparate_impact())
print("Difference in mean outcomes between unprivileged and privileged groups of marital status = %f" % metric_aif_train_ready_marital.mean_difference())
from aif360.algorithms.preprocessing import Reweighing
# Reweigh the training split to balance outcomes across Marital_Status groups.
privileged_groups = [{'Marital_Status': 1}]
unprivileged_groups = [{'Marital_Status': 0}]
RW_Marital = Reweighing(unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
dataset_aif_tranf_marital = RW_Marital.fit_transform(dataset_orig_train)
# Mean difference after reweighing (expected to move toward 0).
metric_transf_train_marital = BinaryLabelDatasetMetric(dataset_aif_tranf_marital,
unprivileged_groups=unprivileged_groups,
privileged_groups=privileged_groups)
print("Difference in mean outcomes between unprivileged and privileged groups of marital status = %f" % metric_transf_train_marital.mean_difference())
# ---- Random forest on the age-reweighed training data ----
# Setting the hyper-parameter grid
param_grid = {"max_depth": [3,5,7,None],
"n_estimators":[3,5,10,15,20],
"max_features": [4,7,15]}
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Creating the classifier, wrapped in a 5-fold grid search optimising recall.
rf_model = RandomForestClassifier(random_state=40)
grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, scoring='recall', verbose=0)
model = grid_search
# Fit on the age-reweighed AIF360 dataset (labels flattened to 1-D).
mdl_age = model.fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
# NOTE(review): a dead `rf_shap_values = shap.KernelExplainer(...)` assignment
# was removed here — it was never read anywhere, and TreeExplainer is used
# for the SHAP analysis below instead.
# Impurity-based feature importances of the best estimator found.
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_age.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
importances
# Horizontal bar chart, least to most important from bottom to top.
plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Features that are important in the model are given above.
mdl_age.best_params_
type(model)
# TreeExplainer on the tuned forest; for a binary sklearn classifier
# shap_values is a list with one array per class.
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_a=explainer.shap_values(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
#https://github.com/slundberg/shap/issues/968
shap_values_a
The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.
Features in blue pushes the base value towards lowest values and features in red moves base levels towards higher values.
shap.initjs()
# Local explanation of the first training row w.r.t. the negative class (index 0).
shap.force_plot(explainer.expected_value[0],shap_values_a[0][0], dataset_aif_tranf_age.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
shap.initjs()
# Same row, positive class (index 1).
shap.force_plot(explainer.expected_value[1],shap_values_a[1][0], dataset_aif_tranf_age.feature_names)
dataset_aif_tranf_age.feature_names
# Stacked (all-rows) force plots for each class.
shap.force_plot(explainer.expected_value[0],
shap_values_a[0][:,:], dataset_aif_tranf_age.features[:,:],feature_names = dataset_aif_tranf_age.feature_names)
shap.force_plot(explainer.expected_value[1],
shap_values_a[1][:,:], dataset_aif_tranf_age.features[:,:],feature_names = dataset_aif_tranf_age.feature_names)
# Summary (beeswarm) plot of feature impact across the whole training set.
p = shap.summary_plot(shap_values_a, dataset_aif_tranf_age.features, feature_names=dataset_aif_tranf_age.feature_names)
display(p)
Variables with higher impact are Age,CreditAmount,NumMonths,Savings etc
# Waterfall plot for the first row, negative class: shows how each feature
# pushes the model output away from the base (expected) value.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_a[0][0],feature_names=dataset_aif_tranf_age.feature_names)
Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.
f(x)- model output impacted by features; E(f(x))- expected output.
One of the fundamental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that the SHAP values of all the input features will always sum up to the difference between the baseline (expected) model output and the current model output for the prediction being explained.
# Same waterfall for the positive class.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_a[1][0],feature_names=dataset_aif_tranf_age.feature_names)
#!pip install eli5
from eli5.sklearn import PermutationImportance
# Permutation importance: score drop when each feature is shuffled.
perm_age = PermutationImportance(model).fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
perm_imp_1=eli5.show_weights(perm_age,feature_names = dataset_aif_tranf_age.feature_names)
perm_imp_1
plt.show()
eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.
The first number in each row shows how much model performance decreased with a random shuffling (in this case, using "accuracy" as the performance metric).
Like most things in data science, there is some randomness to the exact performance change from a shuffling a column. We measure the amount of randomness in our permutation importance calculation by repeating the process with multiple shuffles. The number after the ± measures how performance varied from one-reshuffling to the next.
You'll occasionally see negative values for permutation importances. In those cases, the predictions on the shuffled (or noisy) data happened to be more accurate than the real data. This happens when the feature didn't matter (should have had an importance close to 0), but random chance caused the predictions on shuffled data to be more accurate. This is more common with small datasets, like the one in this example, because there is more room for luck/chance.
# ---- Random forest on the gender-reweighed training data ----
# Refit the same grid-search pipeline on the gender-reweighed dataset.
mdl_gender = model.fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
# NOTE(review): a dead `rf_shap_values = shap.KernelExplainer(...)` assignment
# was removed here — it was never read anywhere.
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_gender.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
importances
# Horizontal bar chart of the impurity-based importances.
plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
mdl_gender.best_params_
type(model)
# TreeExplainer on the tuned forest; shap_values_b is a per-class list.
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_b=explainer.shap_values(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
#https://github.com/slundberg/shap/issues/968
shap_values_b
shap.initjs()
# Local explanation of the first row w.r.t. the negative class.
shap.force_plot(explainer.expected_value[0],shap_values_b[0][0], dataset_aif_tranf_gender.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.
shap.initjs()
# First row, positive class.
shap.force_plot(explainer.expected_value[1],shap_values_b[1][0], dataset_aif_tranf_gender.feature_names)
dataset_aif_tranf_gender.feature_names
# Stacked (all-rows) force plots for each class.
shap.force_plot(explainer.expected_value[0],
shap_values_b[0][:,:], dataset_aif_tranf_gender.features[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
shap.force_plot(explainer.expected_value[1],
shap_values_b[1][:,:], dataset_aif_tranf_gender.features[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
# Summary (beeswarm) plot over the whole training set.
p = shap.summary_plot(shap_values_b, dataset_aif_tranf_gender.features, feature_names=dataset_aif_tranf_gender.feature_names)
display(p)
# Waterfall for the first row, negative class.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_b[0][0],feature_names=dataset_aif_tranf_gender.feature_names)
Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
f(x)- model output impacted by features; E(f(x))- expected output.
The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.
One of the fundamental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that the SHAP values of all the input features will always sum up to the difference between the baseline (expected) model output and the current model output for the prediction being explained.
# Waterfall for the first row, positive class.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_b[1][0],feature_names=dataset_aif_tranf_gender.feature_names)
#!pip install eli5
from eli5.sklearn import PermutationImportance
# Permutation importance on the gender-reweighed dataset.
perm_gender = PermutationImportance(model).fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
perm_imp_2=eli5.show_weights(perm_gender,feature_names = dataset_aif_tranf_gender.feature_names)
perm_imp_2
plt.show()
eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.
# ---- Random forest on the marital-status-reweighed training data ----
# Refit the same grid-search pipeline on the marital-reweighed dataset.
mdl_marital = model.fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
# NOTE(review): a dead `rf_shap_values = shap.KernelExplainer(...)` assignment
# was removed here — it was never read anywhere.
importances = model.best_estimator_.feature_importances_
indices = np.argsort(importances)
features = dataset_aif_tranf_marital.feature_names
#https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv
importances
# Horizontal bar chart of the impurity-based importances.
plt.figure(figsize=(20,30))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# FIX: the original displayed mdl_gender.best_params_ here (copy-paste slip);
# this section concerns the marital-status model.
mdl_marital.best_params_
type(model)
# TreeExplainer on the tuned forest; shap_values_c is a per-class list.
explainer = shap.TreeExplainer(grid_search.best_estimator_)
shap_values_c=explainer.shap_values(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
#https://github.com/slundberg/shap/issues/968
shap_values_c
shap.initjs()
# Local explanation of the first row w.r.t. the negative class.
shap.force_plot(explainer.expected_value[0],shap_values_c[0][0], dataset_aif_tranf_marital.feature_names)
#https://github.com/slundberg/shap
#https://github.com/slundberg/shap/issues/279
The shap_values[0] are explanations with respect to the negative class, while shap_values[1] are explanations with respect to the positive class.
shap.initjs()
# First row, positive class.
shap.force_plot(explainer.expected_value[1],shap_values_c[1][0], dataset_aif_tranf_marital.feature_names)
dataset_aif_tranf_marital.feature_names
# Stacked (all-rows) force plots for each class.
shap.force_plot(explainer.expected_value[0],
shap_values_c[0][:,:], dataset_aif_tranf_marital.features[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
shap.force_plot(explainer.expected_value[1],
shap_values_c[1][:,:], dataset_aif_tranf_marital.features[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
# Summary (beeswarm) plot over the whole training set.
p = shap.summary_plot(shap_values_c, dataset_aif_tranf_marital.features, feature_names=dataset_aif_tranf_marital.feature_names)
display(p)
# Waterfall for the first row, negative class.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0], shap_values_c[0][0],feature_names=dataset_aif_tranf_marital.feature_names)
Interpretation of graph: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
f(x)- model output impacted by features; E(f(x))- expected output.
The above explanation shows features each contributing to push the model output from the base value (the average model output over the training dataset we passed) to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue.
One of the fundamental properties of Shapley values is that they always sum up to the difference between the game outcome when all players are present and the game outcome when no players are present. For machine learning models this means that the SHAP values of all the input features will always sum up to the difference between the baseline (expected) model output and the current model output for the prediction being explained.
# Waterfall for the first row, positive class.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], shap_values_c[1][0],feature_names=dataset_aif_tranf_marital.feature_names)
#!pip install eli5
from eli5.sklearn import PermutationImportance
# Permutation importance on the marital-reweighed dataset.
perm_marital = PermutationImportance(model).fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
perm_imp_3=eli5.show_weights(perm_marital,feature_names = dataset_aif_tranf_marital.feature_names)
perm_imp_3
plt.show()
eli5 provides a way to compute feature importances for any black-box estimator by measuring how score decreases when a feature is not available; the method is also known as “permutation importance” or “Mean Decrease Accuracy (MDA)”.
There are several different ways to calculate feature importances. By default, “gain” is used, that is the average gain of the feature when it is used in trees.
from xgboost import XGBClassifier
# ---- XGBoost models: repeat the analysis with a boosted-tree classifier ----
estimator = XGBClassifier(seed=40)
# Hyper-parameter grid for the boosted trees.
parameters = {
'max_depth': range (2, 10, 2),
'n_estimators': range(60, 240, 40),
'learning_rate': [0.1, 0.01, 0.05]
}
# 5-fold grid search optimising recall (rebinds grid_search/model, so the
# SHAP/eli5 cells below now explain the XGBoost model).
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'recall',
cv = 5,
verbose=0
)
model=grid_search
#rf_shap_values = shap.KernelExplainer(grid_search.predict,dataset_aif_tranf_age.features)
# Fit on the age-reweighed dataset.
mdl_age = model.fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
# With a background dataset supplied, TreeExplainer computes expectations over
# that data; for XGBoost, shap_values is a single 2-D array (row per sample).
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_age.features)
shap_values=explainer.shap_values(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
#https://github.com/slundberg/shap
shap_values
shap.initjs()
# Local force plots for the first two rows.
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_age.feature_names)
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_age.feature_names)
# FIX: the feature matrix shown must be the one the SHAP values were computed
# on. The original passed X.iloc[:,:] (the full one-hot frame), whose columns
# do not match the reduced feature set the model was fitted on.
shap.force_plot(explainer.expected_value, shap_values[:,:], dataset_aif_tranf_age.features[:,:],feature_names = dataset_aif_tranf_age.feature_names)
# Waterfall for the first row.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_age.feature_names)
# Permutation importance of the XGBoost model on the age-reweighed data.
perm_age = PermutationImportance(model).fit(dataset_aif_tranf_age.features, dataset_aif_tranf_age.labels.ravel())
perm_imp=eli5.show_weights(perm_age,feature_names = dataset_aif_tranf_age.feature_names)
perm_imp
plt.show()
# Refit the XGBoost grid search on the gender-reweighed dataset.
mdl_gender = model.fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_gender.features)
shap_values=explainer.shap_values(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
#https://github.com/slundberg/shap
shap_values
shap.initjs()
# Local force plots for the first two rows.
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_gender.feature_names)
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_gender.feature_names)
# FIX: use the same feature matrix the SHAP values were computed on; the
# original passed X.iloc[:,:], whose columns do not match the reduced set.
shap.force_plot(explainer.expected_value, shap_values[:,:], dataset_aif_tranf_gender.features[:,:],feature_names = dataset_aif_tranf_gender.feature_names)
# Waterfall for the first row.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_gender.feature_names)
# Permutation importance of the XGBoost model on the gender-reweighed data.
perm_gender = PermutationImportance(model).fit(dataset_aif_tranf_gender.features, dataset_aif_tranf_gender.labels.ravel())
perm_imp=eli5.show_weights(perm_gender,feature_names = dataset_aif_tranf_gender.feature_names)
perm_imp
plt.show()
# Refit the XGBoost grid search on the marital-status-reweighed dataset.
mdl_marital = model.fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.TreeExplainer(grid_search.best_estimator_,dataset_aif_tranf_marital.features)
shap_values=explainer.shap_values(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
#https://github.com/slundberg/shap
shap_values
shap.initjs()
# Local force plots for the first two rows.
shap.force_plot(explainer.expected_value,shap_values[0,:], dataset_aif_tranf_marital.feature_names)
shap.initjs()
shap.force_plot(explainer.expected_value,shap_values[1,:], dataset_aif_tranf_marital.feature_names)
# FIX: use the same feature matrix the SHAP values were computed on; the
# original passed X.iloc[:,:], whose columns do not match the reduced set.
shap.force_plot(explainer.expected_value, shap_values[:,:], dataset_aif_tranf_marital.features[:,:],feature_names = dataset_aif_tranf_marital.feature_names)
# Waterfall for the first row.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value, shap_values[0,:],feature_names=dataset_aif_tranf_marital.feature_names)
# Permutation importance of the XGBoost model on the marital-reweighed data.
perm_marital = PermutationImportance(model).fit(dataset_aif_tranf_marital.features, dataset_aif_tranf_marital.labels.ravel())
perm_imp=eli5.show_weights(perm_marital,feature_names = dataset_aif_tranf_marital.feature_names)
perm_imp
plt.show()
#!pip install shapash
#Training Tabular Explainer
#import lime.lime_tabular
#explainer = lime.lime_tabular.LimeTabularExplainer(dataset_aif_tranf_gender.features,
# mode='classification',
# feature_names=dataset_aif_tranf_gender.feature_names,
# class_names=dataset_aif_tranf_gender.labels.ravel())
# Function features_check Extract feature names from Lime Output to be used by shapash
#def features_check(s):
# for w in list(dataset_orig_test.feature_names):
# if f' {w} ' in f' {s} ' :
# feat = w
# return feat
##%%time
# Compute local Lime Explanation for each row in Test Sample
#contrib_l=[]
#for ind in dataset_orig_test.subset(0:1000):
# exp = explainer.explain_instance(dataset_orig_test.ind.values, rf.predict_proba, num_features=dataset_orig_test.shape[1])
# contrib_l.append(dict([[features_check(elem[0]),elem[1]] for elem in exp.as_list()]))
#contribution_df =pd.DataFrame(contrib_l,index=dataset_aif_test.index)
# sorting the columns as in the original dataset
#contribution_df = contribution_df[list(dataset_aif_test.columns)]
#from shapash.explainer.smart_explainer import SmartExplainer
#xpl = SmartExplainer() # optional parameter
#xpl.compile(
#x=dataset_aif_tranf_gender.features,
#model=model,
#)
#app = xpl.run_app()